The packages used in this project are: Rio: Chan et al. (2021) Readr: Wickham and Hester (2021) Haven: Wickham and Miller (2021)

Passengers Data

Load data

dat <- import(here("data", "dat.csv")) %>% 
   clean_names() %>% 
   mutate_all(na_if,"")

Clean data

dat$gender <- as.factor(dat$gender)
dat$marital_status <- as.factor(dat$marital_status)
dat$category <- as.factor(dat$category)
dat$class <- as.factor(dat$class)
dat$survived <- as.factor(dat$survived)
dat$embarked <- as.factor(dat$embarked)
dat$disembarked  <- as.factor(dat$disembarked)

# # PEER Review MV: You could consider using dplyr to recode these new variables and then also creating factors that make more sense for some of the variables that have multiple variables. For example, numbering factors based on frequency for marital status. This probably isn't very helpful but a little cleaner than code above.
# dat <- dat %>%
#    mutate(gender = as.factor(gender),
#           marital_status = fct_infreq(marital_status),
#           category = as.factor(category),
#           class = as.factor(class),
#           survived = as.factor(survived),
#           embarked = as.factor(embarked),
#           disembarked = as.factor(disembarked))

dat <- dat %>% 
 mutate(nationality2 = case_when(nationality == "English" ~ "English",
   nationality == "Irish" ~ "Irish",
   nationality == "American" ~ "American",
   nationality == "Swedish" ~ "Swedish",
   nationality == "Finnish" ~ "Finnish",
   nationality == "Scottish" ~ "Scottish",
   nationality == "French" ~ "French",
   nationality == "Italian" ~ "Italian",
   nationality == "Canadian" ~ "Canadian",
   nationality == "Bulgarian" ~ "Bulgarian",
   nationality == "Croatian" ~ "Croatian",
   nationality == "Belgian" ~ "Belgian",
   nationality == "Norwegian" ~ "Norwegian",
   nationality == "Channel Islander" ~ "Channel Islander",
   nationality == "Welsh" ~ "Welsh",
   nationality == "Swiss" ~ "Swiss",
   nationality == "German" ~ "German",
   nationality == "Danish" ~ "Danish",
   nationality == "Spanish" ~ "Spanish",
   nationality == "Australian" ~ "Australian",
   nationality == "Polish" ~ "Polish",
   nationality == "South African" ~ "South African",
   nationality == "Bosnian" ~ "Bosnian",
   nationality == "Hong Kongese" ~ "Hong Kongese",
   nationality == "Dutch" ~ "Dutch",
   nationality == "Lithuanian" ~ "Lithuanian",
   nationality == "Greek" ~ "Greek",
   nationality == "Portuguese" ~ "Portuguese",
   nationality == "Uruguayan" ~ "Uruguayan",
   nationality == "Chinese" ~ "Chinese",
   nationality == "Slovenian" ~ "Slovenian",
   nationality == "Cape Verdean" ~ "Cape Verdean",
   nationality == "Egyptian" ~ "Egyptian",
   nationality == "Japanese" ~ "Japanese",
   nationality == "Hungarian" ~ "Hungarian",
   nationality == "Bosnian" ~ "Bosnian",
   nationality == "Hong Kongese" ~ "Hong Kongese",
   nationality == "Latvian" ~ "Latvian",
   nationality == "Austrian" ~ "Austrian",
   nationality == "Greek" ~ "Greek",
   nationality == "Mexican" ~ "Mexican",
   nationality == "Sweden" ~ "Swedish",
   nationality == "Turkish" ~ "Turkish",
   nationality == "Slovenian" ~ "Slovenian",
   nationality == "Guyanese" ~ "Guyanese",
   nationality == "Haitian" ~ "Haitian",
   nationality == "Syrian,Lebanese" ~ "Syrian/Lebanese",
   nationality == "Unknown" ~ "Unknown",
   TRUE ~ "Other - Multiple", ))

## PEER Review MV: For Nationality, Consider creating a more collapsed factor variable that only has those nationalities with 10 or more individuals and then an other category. Then you could potentially create a cleaner bar graph. Added some code below
# dat <- dat %>%
#    mutate(nationality_cat = fct_lump_min(nationality,10),
#           nationality_cat = fct_infreq(nationality_cat))

dat <- dat %>% 
   mutate(nationality2 = ifelse(nationality2 == "Unknown", NA, nationality2))

datpass <- dat %>% 
   filter(category=="Passenger") %>% 
   select(survived, gender, class, age) %>% 
   na.omit()

Descriptives

# Breakdown of passengers by class and gender
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(gender)) %>% 
   group_by(class, gender) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>%
 kable(caption = "Breakdown of Passengers by Class and Gender",
       col.names = c("Class", "Gender", "Count", "Percent"),
       digits = 2,
       booktabs = TRUE) %>%
 kable_styling()
Breakdown of Passengers by Class and Gender
Class Gender Count Percent
1st Class Female 153 43.71
1st Class Male 197 56.29
2nd Class Female 112 38.36
2nd Class Male 180 61.64
3rd Class Female 216 30.47
3rd Class Male 493 69.53
# Breakdown of passenger nationalities
dat %>% 
   filter(!is.na(nationality2)) %>% 
   group_by(nationality2) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   arrange(desc(percent)) %>%
 kable(caption = "Breakdown of Passenger Nationalities",
       col.names = c("Nationality", "Count", "Percent"),
       digits = 2,
       booktabs = TRUE) %>%
 kable_styling()
Breakdown of Passenger Nationalities
Nationality Count Percent
English 1037 42.36
Irish 361 14.75
American 246 10.05
Other - Multiple 116 4.74
Swedish 100 4.08
Syrian/Lebanese 86 3.51
Finnish 58 2.37
Scottish 49 2.00
French 44 1.80
Italian 41 1.67
Canadian 39 1.59
Bulgarian 33 1.35
Croatian 28 1.14
Belgian 26 1.06
Norwegian 26 1.06
Channel Islander 25 1.02
Welsh 23 0.94
Swiss 22 0.90
German 14 0.57
Danish 11 0.45
Spanish 9 0.37
Australian 7 0.29
Polish 6 0.25
South African 5 0.20
Bosnian 4 0.16
Hong Kongese 4 0.16
Dutch 3 0.12
Greek 3 0.12
Lithuanian 3 0.12
Uruguayan 3 0.12
Chinese 2 0.08
Portuguese 2 0.08
Slovenian 2 0.08
Austrian 1 0.04
Cape Verdean 1 0.04
Egyptian 1 0.04
Guyanese 1 0.04
Haitian 1 0.04
Hungarian 1 0.04
Japanese 1 0.04
Latvian 1 0.04
Mexican 1 0.04
Turkish 1 0.04
# # PEER Review MV: - Consider visualizing some of your data that is currently in tables into bar graphs. This one doesn't have all the info on the table, but easier to see which passengers were most represented
# dat %>%
#    filter(!is.na(nationality2)) %>% 
#    group_by(nationality_cat) %>% 
#    summarize(count = n()) %>% 
#    mutate(percent = (count/sum(count))*100) %>% 
#    arrange((percent)) %>%
#    ggplot(aes(y = nationality_cat)) + 
#    geom_col(aes(x = percent), fill = "dark red") + 
#    geom_text(aes(x = percent, label=round(percent,2)), hjust = -.2, size = 3) +
#    theme_minimal() + 
#    labs(x = "Percentage of all passenges", y = "Nationality")

# Breakdown of passenger nationalities by class (all)
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(nationality2)) %>% 
   group_by(class, nationality2) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   arrange(class, desc(percent)) %>%
 kable(caption = "Breakdown of Passenger Nationalities by Class (All)",
       col.names = c("Class", "Nationality", "Count", "Percent"),
       digits = 2,
       booktabs = TRUE) %>%
 kable_styling()
Breakdown of Passenger Nationalities by Class (All)
Class Nationality Count Percent
1st Class American 195 57.35
1st Class English 51 15.00
1st Class Canadian 27 7.94
1st Class Other - Multiple 14 4.12
1st Class French 10 2.94
1st Class Irish 6 1.76
1st Class Swiss 6 1.76
1st Class German 5 1.47
1st Class Scottish 5 1.47
1st Class Spanish 4 1.18
1st Class Swedish 4 1.18
1st Class Uruguayan 3 0.88
1st Class Belgian 2 0.59
1st Class Italian 2 0.59
1st Class Channel Islander 1 0.29
1st Class Dutch 1 0.29
1st Class Egyptian 1 0.29
1st Class Mexican 1 0.29
1st Class Norwegian 1 0.29
1st Class Polish 1 0.29
2nd Class English 145 51.06
2nd Class Other - Multiple 25 8.80
2nd Class American 24 8.45
2nd Class Channel Islander 12 4.23
2nd Class Irish 12 4.23
2nd Class French 11 3.87
2nd Class Scottish 8 2.82
2nd Class Finnish 6 2.11
2nd Class Swedish 6 2.11
2nd Class Canadian 5 1.76
2nd Class South African 4 1.41
2nd Class Spanish 4 1.41
2nd Class Danish 3 1.06
2nd Class Italian 3 1.06
2nd Class Lithuanian 2 0.70
2nd Class Swiss 2 0.70
2nd Class Syrian/Lebanese 2 0.70
2nd Class Welsh 2 0.70
2nd Class Australian 1 0.35
2nd Class Belgian 1 0.35
2nd Class German 1 0.35
2nd Class Haitian 1 0.35
2nd Class Hungarian 1 0.35
2nd Class Japanese 1 0.35
2nd Class Norwegian 1 0.35
2nd Class Portuguese 1 0.35
3rd Class English 112 15.80
3rd Class Irish 105 14.81
3rd Class Swedish 90 12.69
3rd Class Syrian/Lebanese 83 11.71
3rd Class Other - Multiple 69 9.73
3rd Class Finnish 52 7.33
3rd Class Bulgarian 33 4.65
3rd Class Croatian 28 3.95
3rd Class Norwegian 24 3.39
3rd Class American 23 3.24
3rd Class Belgian 22 3.10
3rd Class Danish 7 0.99
3rd Class Scottish 6 0.85
3rd Class Welsh 6 0.85
3rd Class Canadian 5 0.71
3rd Class French 5 0.71
3rd Class Polish 5 0.71
3rd Class Swiss 5 0.71
3rd Class Bosnian 4 0.56
3rd Class Hong Kongese 4 0.56
3rd Class Italian 4 0.56
3rd Class Greek 3 0.42
3rd Class Channel Islander 2 0.28
3rd Class Chinese 2 0.28
3rd Class German 2 0.28
3rd Class Slovenian 2 0.28
3rd Class Australian 1 0.14
3rd Class Austrian 1 0.14
3rd Class Latvian 1 0.14
3rd Class Lithuanian 1 0.14
3rd Class Portuguese 1 0.14
3rd Class Turkish 1 0.14
# PEER Review MV: Here, I think another good opportunity to visualize tables that intersect Nationality and class. I think you are able to more easily see variation in nationality by class. Interesting how Americans were concentrated in first class, and third class varied much more.
# dat %>% 
#    filter(category == "Passenger") %>% 
#    filter(!is.na(nationality2)) %>% 
#    group_by(class, nationality_cat) %>% 
#    summarize(count = n()) %>% 
#    mutate(percent = (count/sum(count))*100) %>% 
#    arrange(class, desc(percent)) %>%
#    ggplot(aes(y = nationality_cat)) + 
#    geom_col(aes(x = percent), fill = "dark red") + 
#    facet_wrap(~fct_infreq(class)) +
#    geom_text(aes(x = percent, label=round(percent,2)), hjust = -.1, size = 3) +
#    theme_minimal() +
#    labs(x = "Percent of passengers by class", y = "Nationality")



# Average age by class
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(age)) %>% 
   group_by(class) %>% 
   summarize(avg_age = mean(age), min_age = min(age), max_age = max(age)) %>%
 kable(caption = "Average Age by Class",
       col.names = c("Class", "Average Age", "Minimum Age", "Maximum Age"),
       digits = 2,
       booktabs = TRUE) %>%
 kable_styling()
Average Age by Class
Class Average Age Minimum Age Maximum Age
1st Class 39.12 0 71
2nd Class 30.01 0 71
3rd Class 25.12 0 74

Examining Survival

Survival rates

# Survival rate by class
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(survived)) %>% 
   group_by(class, survived) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   arrange(class, survived) %>%
 kable(caption = "Survival Rate by Class",
       col.names = c("Class", "Survived", "Count", "Percent"),
       digits = 2,
       booktabs = TRUE) %>%
 kable_styling()
Survival Rate by Class
Class Survived Count Percent
1st Class Lost 123 37.96
1st Class Saved 201 62.04
2nd Class Lost 166 58.45
2nd Class Saved 118 41.55
3rd Class Lost 528 74.47
3rd Class Saved 181 25.53
# Survival rate by gender
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(survived)) %>% 
   group_by(gender, survived) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   arrange(gender, survived) %>%
 kable(caption = "Survival Rate by Gender",
       col.names = c("Gender", "Survived", "Count", "Percent"),
       digits = 2,
       booktabs = TRUE) %>%
 kable_styling()
Survival Rate by Gender
Gender Survived Count Percent
Female Lost 127 27.25
Female Saved 339 72.75
Male Lost 690 81.08
Male Saved 161 18.92
# Survival rate by class and gender
dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(survived)) %>% 
   group_by(class, gender, survived) %>% 
   summarize(count = n()) %>% 
   mutate(percent = (count/sum(count))*100) %>% 
   arrange(class, gender) %>%
 kable(caption = "Survival Rate by Class and Gender",
       col.names = c("Class", "Gender", "Survived", "Count", "Percent"),
       digits = 2,
       booktabs = TRUE) %>%
 kable_styling()
Survival Rate by Class and Gender
Class Gender Survived Count Percent
1st Class Female Lost 5 3.47
1st Class Female Saved 139 96.53
1st Class Male Lost 118 65.56
1st Class Male Saved 62 34.44
2nd Class Female Lost 12 11.32
2nd Class Female Saved 94 88.68
2nd Class Male Lost 154 86.52
2nd Class Male Saved 24 13.48
3rd Class Female Lost 110 50.93
3rd Class Female Saved 106 49.07
3rd Class Male Lost 418 84.79
3rd Class Male Saved 75 15.21

Density ridges

## PEER Review MV: Cool graphs! 

surv_classhist <- dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(age)) %>% 
   filter(!is.na(survived)) %>% 
   ggplot(aes(age, class)) +
   geom_density_ridges(aes(fill = factor(survived))) +
   labs(title = "Age Distribution of Survival Status By Class", 
   x = "Age Distribution", y = "Passenger Class") +
   theme_minimal() +
   theme(plot.title = element_text(hjust = 0.5))

surv_classhist + scale_fill_manual(name = "Survival", values = c("black","dark red"))

surv_agehist <- dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(age)) %>% 
   filter(!is.na(survived)) %>% 
   ggplot(aes(age, gender)) +
   geom_density_ridges(aes(fill = factor(survived))) +
   labs(title = "Age Distribution of Survival Status By Gender", 
   x = "Age Distribution", y = "Passenger Gender") +
   theme_minimal() +
   theme(plot.title = element_text(hjust = 0.5))

surv_agehist + scale_fill_manual(name = "Survival", values = c("black","dark red"))

surv_ageclass_hist <- dat %>% 
   filter(category == "Passenger") %>% 
   filter(!is.na(age)) %>% 
   filter(!is.na(survived)) %>% 
   ggplot(aes(age, gender)) +
   facet_wrap(~class, nrow=3) +
   geom_density_ridges(aes(fill = factor(survived))) +
   labs(title = "Age Distribution of Survival Status By Class and Gender", 
   x = "Age Distribution", y = "Passenger Gender") +
   theme_minimal() +
   theme(plot.title = element_text(hjust = 0.5))

surv_ageclass_hist + scale_fill_manual(name = "Survival", values = c("black","dark red"))

Predicting survival

ctree <- ctree(survived ~ gender + class + age, data=datpass)
## Note: We are aware the "saved" and "lost" labels are switched in the first graph and are working to figure out why.
plot(ctree)

# PEER Review MV: I am not at all familiar with the ctree command or what it is doing but it appears to be predicting survival rates for each combination of categories as you go down the flowcharts. However, I'm not sure why class appears twice at different levels. If I follow the left most line, it makes sense that female and first class would have highest probability of save rate. But not sure why class appears twice along the same hierarchy. There must be some reasoning behind this but it will be important to clearly explain this in the final write up.

ggparty(ctree) +
  geom_edge() +
  geom_edge_label() +
  geom_node_splitvar() +
  geom_node_plot(gglist = list(geom_bar(aes(x = "", fill = survived),
                                        position = position_fill()),
                               theme_bw(),
                               xlab("Survival"), ylab("Percentage")),
                 shared_axis_labels = TRUE,
                 legend_separator = TRUE,)

Tickets Prices

Load data

fares <- import(here("data", "avgfare.csv")) %>% 
   clean_names()

fares$accommodation <- as.factor(fares$accommodation)
fares$accommodation <- factor(fares$accommodation, levels = c("First-class parlor suite", "First-class cabin", "Second-class cabin", "Third-class cabin"))

Calculate inflation

## PEER Review MV: Would be great to know where these numbers are coming from, a little bit more annotation would be helpful.

p1921 <- (17.9/9.7)
fares$fare_1921 <- p1921*fares$fare_1912
fares$fare_1921 <- round(fares$fare_1921, 2)

p1931 <- (15.2/9.7)
fares$fare_1931 <- p1931*fares$fare_1912
fares$fare_1931 <- round(fares$fare_1931, 2)

p1941 <- (14.7/9.7)
fares$fare_1941 <- p1941*fares$fare_1912
fares$fare_1941 <- round(fares$fare_1941, 2)

p1951 <- (26.0/9.7)
fares$fare_1951 <- p1951*fares$fare_1912
fares$fare_1951 <- round(fares$fare_1951, 2)

p1961 <- (29.9/9.7)
fares$fare_1961 <- p1961*fares$fare_1912
fares$fare_1961 <- round(fares$fare_1961, 2)

p1971 <- (40.5/9.7)
fares$fare_1971 <- p1971*fares$fare_1912
fares$fare_1971 <- round(fares$fare_1971, 2)

p1981 <- (90.9/9.7)
fares$fare_1981 <- p1981*fares$fare_1912
fares$fare_1981 <- round(fares$fare_1981, 2)

p1991 <- (136.2/9.7)
fares$fare_1991 <- p1991*fares$fare_1912
fares$fare_1991 <- round(fares$fare_1991, 2)

p2001 <- (177.1/9.7)
fares$fare_2001 <- p2001*fares$fare_1912
fares$fare_2001 <- round(fares$fare_2001, 2)

p2011 <- (224.9/9.7)
fares$fare_2011 <- p2011*fares$fare_1912
fares$fare_2011 <- round(fares$fare_2011, 2)

p2021 <- (274.3/9.7)
fares$fare_2021 <- p2021*fares$fare_1912
fares$fare_2021 <- round(fares$fare_2021, 2)

Reshape data

fares_tidy <- fares %>%
 pivot_longer(cols = starts_with("fare"),
   names_to = "year",
   names_prefix = "fare_",
   values_to = "price", names_transform = list(year = as.integer))

Inflation-adjustment plot

# # PEER Review MV: Sweet graph!

fare_graph <- fares_tidy %>% 
   ggplot(aes(year, price, colour=accommodation)) +
   geom_line() +
   geom_point() +
   scale_colour_brewer(palette="Spectral") +
   facet_wrap(~ accommodation, 4, scales = "free") +
   xlim(1912,2021) +
   theme(panel.spacing = unit(1, "lines")) +
   labs(y = "Price ($USD)", x = "Year", title = "Inflation-Adjusted Titanic Ticket Prices", subtitle = "From 1912 to 2021", colour = "Accommodation") +
   theme_minimal()

ggplotly(fare_graph)

When taking inflation rates into consideration, we see that the average price for a first class cabin in 1912 was $150.00, which today would be $4,241.74

References

Chan, Chung-hong, Geoffrey CH Chan, Thomas J. Leeper, and Jason Becker. 2021. Rio: A Swiss-Army Knife for Data File i/o.
Wickham, Hadley, and Jim Hester. 2021. Readr: Read Rectangular Text Data. https://CRAN.R-project.org/package=readr.
Wickham, Hadley, and Evan Miller. 2021. Haven: Import and Export ’SPSS’, ’Stata’ and ’SAS’ Files. https://CRAN.R-project.org/package=haven.